* build_var_ocupados.do
* This file modifies the ocupados data by modifying and creating the variables necessary for the regression analysis
* Inputs :/Data/Source/ocupados.dta
* Outputs: /Data/Source/ocupados_mw.dta

* 1. Keep only the cities that appear in all the quarters and known occupations
* 2. "Fix" the weights
* 2.1 Calculate employment rates : Occupied over labor force
* 3. Keep only employed (no selfemployed, familiar), formal and create nominal wage and income variables
* 4. Merge GDP by departamento - Later
* 5. Merge IPC information to calculate real variables
* 6. Calculate real variables. Base= Bogot� 1996 Trimestre 1
* 7. Merge minimum wage and calculate real minimum wages, leads and lags
* 8. Calculate Bartik price variables
* 9. Calculate city specific trends.

*************************** Change log ********************************** 

/*
	01/23/15	JEP		Drop 2001 observation generated when MW is merged
	01/24/15	JEP		Add calculation of mw leads and lags, city specific trends, take these out of build mw measures.
	04/5/2015	JEP		Add calculation of Bartik variables excluding agriculture, mining, public services
*/


clear all
version 13.1

* Project info
cap project, doinfo
if _rc==198 {
	if c(os)=="Unix" loc master "/home/jperez/AA_Minimum_Wage" 
 else if c(username)=="J16339" | c(username)=="A15858" loc master "B:\Col_Minimum_Wage" 
	else loc master "C:/Users/jorpp/Dropbox (Brown)/Col_Minimum_Wage"
 loc pr = 0
}
else {
	local master "`r(pdir)'"
	local doname "`r(dofile)'"  
 loc pr = 1
	* Project calls
project, uses("`master'/Data/Source/ocupados.dta")
project, uses("`master'/Data/Clean/Precios/precios98_c.dta")
project, uses("`master'/Data/Clean/Precios/precios98_ec.dta")
project, uses("`master'/Data/Clean/Precios/precios98_p.dta")
project, uses("`master'/Data/Clean/MW/serie_salario_minimo.dta")
}


use "`master'/Data/Source/ocupados.dta"



* cd "C:/Users/JorgeEduardo/Dropbox/AA_Minimum_Wage"
* use "Data/Source/ocupados.dta", clear

***** 1. Keep only the cities that appear in all the quarters and known occupations

/* levelsof area, local(areas)
levelsof time, local(times)
foreach area in `areas' {
	foreach time in `times'	{
		cap assert area!=`area' if time==`time'
		if !_rc drop if area==`area'
	}
} */
* Muy poquita gente en el Valle de Aburr�
drop if area==6
* Resultado final
keep if inlist(area,5,6,8,11,17,52,68,76)

* Drop unknown occupation
drop if cocupacion==0

***** 2. "Fix" the weights
* This is very heuristic, but that's the way it's done when working with the monthly ECH
* Remember the ocupados files are representative by themselves
* Current weights are to generate quarterly totals.
* For yearly totals, divide by 4
gen fexy=fex/4
* For sample, divide fexy by 5
gen fexs=fexy/5

***** 2.1 Calculate employment rates, age, gender and education composition by city time
* Notice these are calculated over the labor force.
preserve
xi i.cedad
collapse (count) tipo_registro [iw=fex], by(ocupado area time)
ren tipo num
reshape wide num, i(area time) j(ocupado)
gen emp=num3/(num1+num3)
gen empcount=num3
gen lfcount=num1
keep emp empcount lfcount area time
* Save a dataset with employment rates
save "`master'/Data/Source/empleo.dta", replace
restore
merge n:1 area time using "`master'/Data/Source/empleo.dta"
drop _merge 

* Also calculate by industry - time
preserve 
collapse (count) tipo_registro [iw=fex], by(ocupado cactividad_empresa time)
ren tipo num
reshape wide num, i(cactividad_empresa time) j(ocupado)
gen empind=num3/1000
keep empind cactividad_empresa time
save "`master'/Data/Source/empleo_industria.dta", replace
restore
merge n:1 cactividad_empresa time using "`master'/Data/Source/empleo_industria.dta"
drop _merge 


* Now only keep occupied
keep if ocupado==3

***** 3. Code the wage variable
* Drop familiar workers, self-employed, owners, other
* Familiar, cuenta propia, patr�n, otro
* I see no reason to drop domestic workers
* Modification: 75% of domestic workers are below minimum wage! I drop them in some samples.

drop if inlist(tipo_trabajador,1,5,6,7)

* Formality: afiliado salud
* If afiliado then cubierto
replace cubierto_ss=1 if afiliado_salud==1
* Replace afiliado_salud = cubierto for the missing quarters. There is not much discrepancy.
replace afiliado_salud=cubierto if year==1996 & (trim==2 | trim == 4)

* Drop strange values of salario.
* We are going to run quantile regressions which are more robust to outliers, but still, some values make no sense
drop if salario==0 | salario==9 | salario==98
drop if salario_period==.
* Monthly
gen salario_mensual= salario*salario_period
* Rounded monthly
gen salario_mensual_round = round(salario_mensual/1000)*1000
* Weekly
gen salario_semanal= salario_mensual/4
* Hourly
gen salario_hora = salario_semanal / horas_semana if horas_semana!=.
replace salario_hora = salario_semanal / horas_ultima if horas_semana==. & horas_ultima!=.
replace salario_hora = salario_semanal / 40 if horas_semana==. & horas_ultima!=.
* Ingresos
* Ojo que hay unos que se reportan mensuales y otros que se reportan anuales.
* Ver Metadata/frecuencia_ingresos 

foreach x of varlist ing_* otros_ingresos {
	replace `x'=. if `x'==98 | `x'==99
}

gen otros_ingresos_m = otros_ingresos
gen ing_trabajo_m = ing_trabajo
gen ing_intereses_m = ing_intereses
replace ing_intereses_m = ing_intereses_m / 12 if year==2000
gen ing_arriendos_m = ing_arriendos
gen ing_ayudas_m = ing_ayudas
replace ing_ayudas_m = ing_ayudas_m/12 if year==2000
gen ing_otros_m = ing_otros
replace ing_otros_m = ing_otros/12 if year==2000

egen ing_m = rowtotal(ing_*_m otros_ingresos_m), mi

***** 4 - Merge GDP by departamento - Later if needed
merge n:1 area using "`master'/Data/Clean/PIBdept/Departamento_Capital.dta" , nogen
merge n:1 area year using "`master'/Data/Clean/PIBdept/VAdept.dta",keep(match master)
drop _merge 
merge n:1 area year using "`master'/Data/Clean/PIBdept/PIBdept.dta", nogen

***** 5 - Merge price information to calculate real variables
recode estrato (1 2 = 3) (3 5 = 4) (6=1), gen(est)
* No hay IPC para el valle de Aburr� ...
decode area, gen(ciudad)

merge n:1 ciudad est trim year using "`master'/Data/Clean/Precios/precios98_ec.dta" , keep(match master)
* merge n:1 ciudad est trim year using "Data/Clean/Precios/precios98_ec.dta" , keep(match master)
ren _merge _merge3
merge n:1 ciudad trim year using "`master'/Data/Clean/Precios/precios98_c.dta" , keep(match master)
* merge n:1 ciudad trim year using "Data/Clean/Precios/precios98_c.dta" , keep(match master)
ren _merge _merge4
merge n:1 trim year using "`master'/Data/Clean/Precios/precios98_p.dta" , keep(match master)
* merge n:1 trim year using "Data/Clean/Precios/precios98_p.dta" , keep(match master)
ren _merge _merge5


***** 6. Calculate real variables
* Podr�a deflactar los salarios por el ipc en cada ciudad Y CADA ESTRATO
* sum ipc if ciudad=="BOGOT�" & trim==1 & year==1996 & estrato==3, meanonly
* glo base=r(mean)
* gen salario_mensual_real = .
* replace salario_mensual_real=salario_mensual*$base/ipc
* Pero entonces el porcentaje de gente que tiene entre el viejo y el nuevo salario m�nimo no va a ser constante por ciudad, va a variar por estrato. Eso no me sirve. Adem�s no s� si la encuesta es representativa por ciudad-estrato.
* Aunque podr�a sacar el salario m�nimo real por estrato, �C�mo se interpreta eso? Estar�a contaminado por el efecto de los precios dentro de cada estrato.
* Adem�s entonces solo estar�amos hablando de la desigualdad dentro de cada estrato.
* Voy a deflactar por ciudad
* Base= Pa�s 1996 Trimestre 1
* Ver derivaci�n del deflactor en Metadata/deflactor.tex, por si hay confusi�n
sum ipc_p if trim==1 & year==1996, meanonly
glo base=r(mean)
gen deflactor_ciudad=$base/ipc_c
gen salario_mensual_real=salario_mensual*deflactor_ciudad
gen salario_semanal_real=salario_semanal*deflactor_ciudad
gen salario_hora_real=salario_hora*deflactor_ciudad
gen ing_m_real=ing_m*deflactor_ciudad

preserve
glo list ""
forv j=10(10)90 {
		glo list "$list (p`j') p`j'w = salario_mensual_real"
}
collapse (mean) salario_mensual salario_mensual_real deflactor_ciudad $list [iw=fex], by(ciudad time)
save "`master'/Data/Source/wage_ts.dta", replace
restore

**** 7.  Merge minimum wage and calculate real minimum wages
merge n:1 year using "`master'/Data/Clean/MW/serie_salario_minimo.dta"
* merge n:1 year using "Data/Clean/MW/serie_salario_minimo.dta"
drop if _merge==2 & year!=2001
drop if year==2001
ren _merge _merge6

* There are using the country ipc
* Base 1996
gen minw_mensual_real=minw*$base/ipc_p
gen minw_mensual_real_round=minw_round*$base/ipc_p
gen minw_mensual_real_subsidio = minw_subsidio*$base/ipc_p
gen minw_mensual_real_subsidio_round = minw_subsidio_round* $base/ipc_p
gen minw_diario_real=minw_diario*$base/ipc_p
gen minw_semanal_real=minw_diario_real*5
gen minw_hora_real=minw_diario_real/8

/* preserve
collapse (mean) salario_mensual salario_mensual_real (firstnm) minw minw_mensual_real ipc_c, by(ciudad time)
list
* twoway (line salario* year, yaxis(1)) (line minw* year, yaxis(2)) 
line minw_mensual_real time if ciudad=="BOGOT�" ,  xline(`=tq(1998q4)')
restore */

/* preserve
keep if trim==4
collapse (mean) salario_mensual salario_mensual_real (firstnm) minw minw_mensual_real ipc_c, by(ciudad year)
list
* twoway (line salario* year, yaxis(1)) (line minw* year, yaxis(2)) if ciudad=="BOGOT�"
line minw_mensual_real year if ciudad=="BOGOT�", xline(1998)
restore */


* Calculate leads and lags of the real minimum wage by city
preserve
collapse (firstnm) minw minw_round minw_mensual_real minw_mensual_real_round ipc_c, by(ciudad time)
encode ciudad, gen(ccode)
xtset ccode time
foreach x in minw minw_round minw_mensual_real minw_mensual_real_round {
	gen `x'_f1 = f.`x'
	gen `x'_l1 = l.`x'
}
list
tempfile minws
save `minws'
restore
merge n:1 ciudad time using `minws'
drop _merge


* 8. Calculate Bartik price variables

preserve

gen lw=log(salario_mensual_real)

* Calculate mean wages and employment shares by city industry
collapse (count) count=salario_mensual_real (mean) salario=lw [iw=fex], by(ciudad cactividad time)
* Redistribute missing according to employment shares
* Generate employment shares
bys ciudad time: egen totemp=total(count) if cactividad!=.
gen share=count/totemp

* Get the number of employees in missing, to be assigned
bys ciudad time (cactividad): gen toassign=count[_N]
replace toassign=. if cactividad!=.
* Propagate to all industries
bys ciudad time: egen ta=max(toassign)
* Add ta to each industry according to the share, round to get integers
replace count=round(count+share*ta) if ta!=.

* Recalculate total and employement shares to account for rounding error
drop totemp share
drop if cactividad==.
bys ciudad time: egen totemp=total(count) if cactividad!=.
bys ciudad time: egen totemp_ex=total(count) if cactividad!=. & !inlist(cactividad,13,29,42)
* Get a tempfile here with the employment counts per industry to be merged back
tempfile ie
save `ie'

gen share=count/totemp
gen share_ex=count/totemp_ex
replace share_ex=. if inlist(cactividad,13,29,42)

drop toassign ta

* For each city and industry at a given time, calculate average industry wages over the cities excluding the city considered
* This is sort of a clumsy code ...
gen wage=.
gen emp=.
levelsof ciudad, local(c)
levelsof cactividad, local(i)
levelsof time, local(t)
foreach city in `c' {
	foreach ind in `i' {
		cap drop meanw tempp
		bys time: egen meanw=mean(salario) if cactividad==`ind' & ciudad!="`city'"
		bys time: egen tempp=total(count) if cactividad==`ind' & ciudad!="`city'"
		foreach tt in `t' {
			sum meanw if time==`tt', meanonly
			replace wage=r(mean) if cactividad==`ind' & ciudad=="`city'" & time==`tt'
			sum tempp if time==`tt', meanonly
			replace emp=r(sum) if cactividad==`ind' & ciudad=="`city'" & time==`tt'
		}
	}
}

drop meanw tempp

* Multiply share in t by wage_t+1 - wage_t for each industry within each city at each time
sort ciudad cactividad time
gen prodp=share[_n-1]*(wage[_n] -wage[_n-1]) if time!=tq(1996q1)
gen prodq=share[_n-1]*(emp[_n] -emp[_n-1]) if time!=tq(1996q1)
* Excluding some industries
gen prodp_ex=share_ex[_n-1]*(wage[_n] -wage[_n-1]) if time!=tq(1996q1)
gen prodq_ex=share_ex[_n-1]*(emp[_n] -emp[_n-1]) if time!=tq(1996q1)
* Sum this by city time
collapse (sum) bartikp1=prodp bartikq=prodq bartikp1_ex=prodp_ex bartikq_ex=prodq_ex  , by(ciudad time)
foreach var in bartikp1 bartikq bartikp1_ex bartikq_ex {
	replace `var'=. if time==tq(1996q1)
}




* Merge back
tempfile tom
save `tom'
restore

merge n:1 ciudad time using `tom'
drop _merge
* merge n:1 ciudad cactividad time using `ie', keepusing(empind)
* drop _merge


* 9. Calculate City specific trends

	
preserve
bys ciudad time: keep if _n==1
sort ciudad time
bys ciudad : gen trend=_n
keep ciudad time trend
tempfile trend
save `trend'
restore
merge n:1 ciudad time using `trend'
drop _merge


save "`master'/Data/Source/ocupados_mw.dta", replace


* Project calls
if `pr' {
	project, relies_on("`master'/Metadata/deflactor.tex")
	project, creates("`master'/Data/Source/ocupados_mw.dta")
	project, creates("`master'/Data/Source/wage_ts.dta")
	project, creates("`master'/Data/Source/empleo.dta")
}










		
	